This notebook explores the ABM-generated data of different scenarios and replications.
import mesa
import numpy as np
import random
import networkx as nx
import bisect
from pathlib import Path
import os
import sys
import csv
import pandas as pd
# Basic paths
# Make the sibling "scripts" folder importable (it holds data_cleaning.py).
src_path = str(Path.cwd().parent / "scripts")
sys.path.append(src_path)
project_path = Path().resolve().parent
# All scenario/replication CSVs live under <project>/data.
csv_path = project_path / "data"
# Python file stored in script folder
from data_cleaning import *  # NOTE(review): wildcard import; presumably provides process_data — verify
🔹 Useful for step-wise analyses. \ 🔹 Contains all the agents' information at each step.
'''
Only Scenario 1
'''
# Load every replication of scenario 1 (step-level agent records).
num_reps = 10
rdfs = [
    pd.read_csv(csv_path / f"scenario1_rep{rep}.csv", na_filter=True)
    for rep in range(1, num_reps + 1)
]
# Peek at the first replication.
rdfs[0]
| Step | AgentID | Age | Education | Gender | Income | Spouse | Parents | Generation | Cohort | Children | Weight | Capital | Brut | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 22 | 4 | M | 383.912 | NaN | [] | 1 | 2 | [525, 722] | [4, 9, 6] | [0.56, 0.456, 0] | NaN |
| 1 | 0 | 1 | 59 | 2 | F | 572.317 | NaN | [] | 1 | 1 | [] | [3, 7, 9] | [0.26, 0.608, 0] | NaN |
| 2 | 0 | 2 | 46 | 2 | F | 207.210 | NaN | [] | 1 | 1 | [] | [3, 4, 7] | [0.26, 0.181, 0] | NaN |
| 3 | 0 | 3 | 31 | 1 | M | 524.950 | NaN | [] | 1 | 2 | [] | [1, 7, 8] | [0.11, 0.558, 0] | NaN |
| 4 | 0 | 4 | 30 | 3 | F | 208.252 | NaN | [] | 1 | 2 | [633, 668] | [7, 5, 4] | [0.51, 0.195, 0] | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 148686 | 499 | 5328 | 10 | 4 | M | 679.790 | NaN | [5301, 5276] | 53 | 55 | [] | [7, 7, 1] | [0.68, 0.722, 0.97] | 32.0 |
| 148687 | 499 | 5329 | 10 | 5 | M | 773.909 | NaN | [5303, 5293] | 53 | 55 | [] | [4, 3, 1] | [0.7, 0.629, 0.912] | 31.0 |
| 148688 | 499 | 5330 | 8 | 4 | M | 782.493 | NaN | [5303, 5293] | 53 | 55 | [] | [1, 4, 1] | [0.44, 0.685, 0.97] | 32.0 |
| 148689 | 499 | 5331 | 8 | 5 | M | 702.780 | NaN | [5306, 5300] | 53 | 55 | [] | [2, 7, 3] | [0.6, 0.747, 0.97] | 32.0 |
| 148690 | 499 | 5332 | 6 | 5 | M | 393.503 | NaN | [5306, 5300] | 53 | 55 | [] | [7, 8, 1] | [0.85, 0.443, 0.971] | 33.0 |
148691 rows × 14 columns
🔹 Useful when flattened data for each agent is required. \ 🔹 Here, you cannot analyze with respect to time steps.
"""
Run this only if you already have the data sets. Otherwise, next line.
"""
num_scen = 3
num_reps = 10
dfs = []
for j in range(1, num_scen+1):
for i in range(1, num_reps+1):
file_name = f"s{j}r{i}_analysis.csv"
df = pd.read_csv(csv_path/file_name, index_col=0)
df['Scenario'] = j
df['Replication'] = i
dfs.append(df)
"""
Run this only when data sets aren't provided. Warning: it takes long to run!!
"""
# num_scen = 3
# num_reps = 10
# dfs = []
# for j in range(1, num_scen+1):
# for i in range(1, num_reps+1):
# file_name = f"scenario{j}_rep{i}.csv"
# df = process_data(csv_path / file_name)
# df['Scenario'] = j
# df['Replication'] = i
# dfs.append(df)
dfs[10]
| AgentID | Step | Age | Edu_level | Gender | Income | Generation | Cohort | Weight | Capital | ... | Mother_Income | Father_Income | Mother_Cultural | Mother_Economic | Mother_Social | Father_Cultural | Father_Economic | Father_Social | Scenario | Replication | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 46 | 3 | F | 525.366 | 1 | 1 | [9, 6, 7] | ['0.57', '0.525', '0'] | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 1 |
| 1 | 1 | 20 | 58 | 4 | F | 552.580 | 1 | 3 | [4, 10, 9] | ['0.56', '0.691', '0.413'] | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 1 |
| 2 | 10 | 1 | 47 | 1 | F | 592.263 | 1 | 1 | [10, 1, 7] | ['0.2', '0.407', '0.904'] | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 1 |
| 3 | 100 | 1 | 51 | 2 | F | 434.857 | 1 | 1 | [6, 5, 4] | ['0.32', '0.408', '0.827'] | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2 | 1 |
| 4 | 1000 | 53 | 60 | 5 | F | 513.299 | 4 | 6 | [6, 8, 9] | ['0.8', '0.577', '0.942'] | ... | 721.298 | 753.854 | 0.60 | 0.586 | 0.828 | 0.55 | 0.565 | 0.697 | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7232 | 995 | 54 | 62 | 4 | M | 614.759 | 4 | 6 | [3, 1, 9] | ['0.52', '0.423', '0.954'] | ... | 711.882 | 653.918 | 0.60 | 0.801 | 0.804 | 0.54 | 0.572 | 0.570 | 2 | 1 |
| 7233 | 996 | 58 | 70 | 3 | F | 201.748 | 4 | 6 | [7, 3, 7] | ['0.51', '0.164', '0.907'] | ... | 799.748 | 764.927 | 0.56 | 0.700 | 0.905 | 0.56 | 0.813 | 0.943 | 2 | 1 |
| 7234 | 997 | 54 | 62 | 5 | M | 775.373 | 5 | 6 | [7, 3, 1] | ['0.85', '0.63', '0.889'] | ... | 796.356 | 380.924 | 1.00 | 0.547 | 0.404 | 1.00 | 0.429 | 0.275 | 2 | 1 |
| 7235 | 998 | 55 | 64 | 4 | F | 484.957 | 5 | 6 | [9, 4, 1] | ['0.76', '0.424', '0.843'] | ... | 502.834 | 488.990 | 0.75 | 0.629 | 0.613 | 0.75 | 0.336 | 0.638 | 2 | 1 |
| 7236 | 999 | 54 | 62 | 3 | F | 682.823 | 4 | 6 | [3, 4, 8] | ['0.39', '0.597', '0.823'] | ... | 621.984 | 57.366 | 0.42 | 0.583 | 0.524 | 0.48 | 0.054 | 0.358 | 2 | 1 |
7237 rows × 36 columns
🔻 Save the cleaned/processed data as CSV files (to analyze in R for Relative Mobility)
# Save each cleaned/flattened table so the relative-mobility analysis in R
# can read it back without re-running process_data().
for df in dfs:
    # .iloc avoids chained label-based lookup (df['Scenario'][0]), which
    # breaks if the frame's index does not start at 0.
    scen = df['Scenario'].iloc[0]
    rep = df['Replication'].iloc[0]
    df.to_csv(csv_path / f's{scen}r{rep}_analysis.csv')
The following two plots give a first, rough view of how the scenarios differ.
import plotly.graph_objects as go

# Mean education level per cohort: one trace per replication, one colour per
# scenario, marker symbol distinguishes the scenario.
df_dict = {}
idx = 0
for scen_num in range(1, num_scen + 1):
    label = f"scenario {scen_num}"
    for _ in range(num_reps):
        cohort_means = dfs[idx].groupby(['Cohort'], as_index=False).Edu_level.mean()
        df_dict.setdefault(label, []).append(cohort_means)
        idx += 1

# Labels for replications and scenarios.
replications = [f"rep {i}" for i in range(1, num_reps + 1)]
scenarios = [f"scenario {j}" for j in range(1, num_scen + 1)]

fig = go.Figure()
for scen_idx, scenario in enumerate(scenarios):
    # Same hue for every replication of a scenario.
    hue = f"hsl({(scen_idx/len(scenarios))*360}, 50%, 50%)"
    for rep_idx, rep_means in enumerate(df_dict[scenario]):
        fig.add_trace(go.Scatter(x=rep_means["Cohort"], y=rep_means["Edu_level"],
                                 name=f"{scenario} - rep {rep_idx+1}",
                                 mode="lines+markers",
                                 line=dict(color=hue),
                                 marker=dict(symbol=int(scenario[-1]))))
fig.show()
# Mean education per model step for each replication of scenario 1.
df_dict = {}
for rep_num in range(1, num_reps + 1):
    step_means = rdfs[rep_num - 1].groupby(['Step'], as_index=False).Education.mean()
    df_dict[f"rep {rep_num}"] = step_means

# Labels for the replications.
replications = [f"rep {i}" for i in range(1, num_reps + 1)]

fig = go.Figure()
for idx, rep_label in enumerate(replications):
    series = df_dict[rep_label]
    fig.add_trace(go.Scatter(x=series["Step"], y=series["Education"],
                             name=rep_label,
                             mode="lines+markers",
                             marker=dict(symbol=idx + 1)))
fig.show()
import plotly.graph_objects as go

# Absolute-mobility measure 1 per cohort, one trace per replication:
# among children whose parents both have Edu < 5, the share whose own
# Edu_level exceeds both parents'.
df_dict = {}
idx = 0
for scen_num in range(1, num_scen + 1):
    label = f"scenario {scen_num}"
    for _ in range(num_reps):
        frame = dfs[idx]
        low_parents = frame[(frame['Mother_Edu'] < 5) & (frame['Father_Edu'] < 5)]
        upward = low_parents[(low_parents['Edu_level'] > low_parents['Mother_Edu'])
                             & (low_parents['Edu_level'] > low_parents['Father_Edu'])].groupby('Cohort').size()
        share = upward / low_parents.groupby(['Cohort']).size()
        df_dict.setdefault(label, []).append({'Cohort': share.index, 'Probability': share.values})
        idx += 1

# Labels for replications and scenarios.
replications = [f"rep {i}" for i in range(1, num_reps + 1)]
scenarios = [f"scenario {j}" for j in range(1, num_scen + 1)]

fig = go.Figure()
for scen_idx, scenario in enumerate(scenarios):
    # Same hue for every replication of a scenario.
    hue = f"hsl({(scen_idx/len(scenarios))*360}, 50%, 50%)"
    for rep_idx, entry in enumerate(df_dict[scenario]):
        fig.add_trace(go.Scatter(x=entry["Cohort"], y=entry["Probability"],
                                 name=f"{scenario} - rep {rep_idx+1}",
                                 mode="lines+markers",
                                 line=dict(color=hue),
                                 marker=dict(symbol=int(rep_idx))))
fig.show()
# Build the absolute-mobility tables, one row per (Scenario, Replication, Cohort).
# Measure 1 (absolute): among children whose parents both have Edu < 5, the
#   share whose own Edu_level exceeds both parents'.
# Measure 2 (absolute2): measure 1 plus, for children with at least one parent
#   at the top level (5), the share who also reach level 5.
measure1_frames = []
measure2_frames = []
row = 0
for j in range(1, num_scen+1):
    for i in range(1, num_reps+1):
        df = dfs[row]
        am1 = df[(df['Mother_Edu'] < 5) & (df['Father_Edu'] < 5)]
        am2 = df[(df['Mother_Edu'] == 5) | (df['Father_Edu'] == 5)]
        counts = am1[(am1['Edu_level'] > am1['Mother_Edu']) & (am1['Edu_level'] > am1['Father_Edu'])].groupby('Cohort').size()
        counts2 = am2[(am2['Edu_level'] == 5)].groupby('Cohort').size()
        totals = am1.groupby(['Cohort']).size()
        totals2 = am2.groupby('Cohort').size()
        probs = counts/totals
        probs2 = counts2 / totals2
        # NOTE(review): the sum aligns on the union of the two cohort indices,
        # so cohorts missing from either side produce NaN — confirm intended.
        probs3 = probs2 + probs
        measure1_frames.append(pd.DataFrame({"Scenario": j, "Replication": i, "Cohort": probs.index, "Probability": probs.values}))
        measure2_frames.append(pd.DataFrame({"Scenario": j, "Replication": i, "Cohort": probs3.index, "Probability": probs3.values}))
        row += 1
# Concatenate once after the loop (avoids quadratic repeated concat).
absolute = pd.concat(measure1_frames, ignore_index=True)
# BUG FIX: absolute2 previously re-concatenated `absolute` with only the LAST
# iteration's measure-2 rows; it now accumulates every replication's rows.
absolute2 = pd.concat(measure2_frames, ignore_index=True)
# absolute.drop('Replication')
# Aggregate measure-1 probabilities across replications: one row per
# (Scenario, Cohort) with mean/median/variance/SEM/SD and a 95% CI.
abs1 = absolute.groupby(['Scenario', 'Cohort']).agg({'Probability': ["mean", "median", "var", "sem", "std"]}).reset_index()
abs1['ci95_hi'] = abs1['Probability']['mean'] + 1.96* abs1['Probability']['sem']
abs1['ci95_lo'] = abs1['Probability']['mean'] - 1.96* abs1['Probability']['sem']
# abs1.columns = abs1.columns.droplevel()
# Flatten the MultiIndex columns ('Probability','mean') -> 'Probability_mean';
# plain columns pick up a trailing '_' which the rename below strips.
abs1.columns = ['_'.join(col) for col in abs1.columns]
abs1.rename(columns = {'Scenario_':'Scenario', 'Cohort_':'Cohort', 'ci95_hi_': 'ci95_hi', 'ci95_lo_':'ci95_lo'}, inplace = True)
abs1
| Scenario | Cohort | Probability_mean | Probability_median | Probability_var | Probability_sem | Probability_std | ci95_hi | ci95_lo | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 0.368278 | 0.372768 | 0.003509 | 0.018733 | 0.059239 | 0.404995 | 0.331562 |
| 1 | 1 | 4 | 0.394438 | 0.373190 | 0.008489 | 0.029136 | 0.092137 | 0.451545 | 0.337331 |
| 2 | 1 | 5 | 0.390056 | 0.390476 | 0.005815 | 0.024114 | 0.076255 | 0.437319 | 0.342792 |
| 3 | 1 | 6 | 0.405786 | 0.398630 | 0.004566 | 0.021367 | 0.067569 | 0.447666 | 0.363906 |
| 4 | 1 | 7 | 0.401345 | 0.385705 | 0.009663 | 0.031086 | 0.098303 | 0.462273 | 0.340416 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 197 | 3 | 64 | 0.500000 | 0.500000 | NaN | NaN | NaN | NaN | NaN |
| 198 | 3 | 65 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 199 | 3 | 66 | 0.500000 | 0.500000 | NaN | NaN | NaN | NaN | NaN |
| 200 | 3 | 67 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 201 | 3 | 69 | 0.500000 | 0.500000 | NaN | NaN | NaN | NaN | NaN |
202 rows × 9 columns
'''
To csv file for further analysis
'''
# Export per-replication and aggregated absolute-mobility tables.
absolute.to_csv(csv_path/"absolute_measure1.csv")
absolute2.to_csv(csv_path/"absolute_measure2.csv")
abs1.to_csv(csv_path/"absolute_measure1_sum.csv")
# NOTE(review): abs2 is defined only further down in the notebook; running the
# cells top-to-bottom raises NameError here — confirm intended execution order.
abs2.to_csv(csv_path/"absolute_measure2_sum.csv")
import plotly.express as px

# Scatter of mean measure-1 probability per cohort, with asymmetric 95% CI bars.
abs1["Scenario"] = abs1["Scenario"].astype(str)
fig = px.scatter(abs1, x='Cohort', y='Probability_mean', color='Scenario')
fig = fig.update_traces(
    error_y={'type': 'data',
             'symmetric': False,
             'array': abs1['ci95_hi'] - abs1['Probability_mean'],
             'arrayminus': abs1['Probability_mean'] - abs1['ci95_lo']}
)
fig.show()
'''
Continuous Graph
'''
import plotly.graph_objects as go

# Line plot of mean measure-1 probability per cohort with a shaded 95% CI band
# per scenario.
abs1["Scenario"] = abs1["Scenario"].astype(str)
scenarios = abs1["Scenario"].unique()
# Fixed palette at 30% opacity for the CI fill; the mean line reuses the same
# colour at full opacity.
color_map = {
    scenario: f"rgba({r}, {g}, {b}, 0.3)"
    for scenario, (r, g, b) in zip(scenarios, [(99, 109, 250), (239, 85, 59), (0, 204, 150)])
}

fig = go.Figure()
for scenario in scenarios:
    subset = abs1[abs1["Scenario"] == scenario]
    band_fill = color_map[scenario]
    # Mean line.
    fig.add_trace(go.Scatter(x=subset['Cohort'],
                             y=subset['Probability_mean'],
                             name=f'Scenario {scenario}',
                             mode='lines',
                             line=dict(color=band_fill.replace('0.3', '1'))))
    # Invisible upper CI bound; the lower-bound trace fills up to it.
    fig.add_trace(go.Scatter(x=subset['Cohort'],
                             y=subset['ci95_hi'],
                             name=f'Upper Bound of {scenario}',
                             mode='lines',
                             marker=dict(color="#444"),
                             line=dict(width=0),
                             showlegend=False))
    # Lower CI bound with fill to the previous trace -> shaded band.
    fig.add_trace(go.Scatter(x=subset['Cohort'],
                             y=subset['ci95_lo'],
                             name=f'Lower Bound of {scenario}',
                             mode='lines',
                             marker=dict(color="#444"),
                             line=dict(width=0),
                             showlegend=False,
                             fillcolor=band_fill,
                             fill='tonexty'))

fig.update_layout(
    xaxis_title="Cohort",
    yaxis_title="Mean Probability",
    # paper_bgcolor='rgba(0,0,0,0)',
    # plot_bgcolor = 'rgba(0,0,0,0)',
    # showlegend=False
)
fig.show()
fig.write_image(project_path/"images/abs1_means2.png")
fig.write_html(project_path/"images/abs1_means.html") # with legend
# absolute.drop('Replication')
# Aggregate measure-2 probabilities across replications: one row per
# (Scenario, Cohort) with mean/median/variance/SEM/SD and a 95% CI.
abs2 = absolute2.groupby(['Scenario', 'Cohort']).agg({'Probability': ["mean", "median", "var", "sem", "std"]}).reset_index()
abs2['ci95_hi'] = abs2['Probability']['mean'] + 1.96* abs2['Probability']['sem']
abs2['ci95_lo'] = abs2['Probability']['mean'] - 1.96* abs2['Probability']['sem']
# Flatten the MultiIndex columns ('Probability','mean') -> 'Probability_mean';
# plain columns pick up a trailing '_' which the rename below strips.
abs2.columns = ['_'.join(col) for col in abs2.columns]
abs2.rename(columns = {'Scenario_':'Scenario', 'Cohort_':'Cohort', 'ci95_hi_': 'ci95_hi', 'ci95_lo_':'ci95_lo'}, inplace = True)
abs2
| Scenario | Cohort | Probability_mean | Probability_median | Probability_var | Probability_sem | Probability_std | ci95_hi | ci95_lo | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 0.368278 | 0.372768 | 0.003509 | 0.018733 | 0.059239 | 0.404995 | 0.331562 |
| 1 | 1 | 4 | 0.394438 | 0.373190 | 0.008489 | 0.029136 | 0.092137 | 0.451545 | 0.337331 |
| 2 | 1 | 5 | 0.390056 | 0.390476 | 0.005815 | 0.024114 | 0.076255 | 0.437319 | 0.342792 |
| 3 | 1 | 6 | 0.405786 | 0.398630 | 0.004566 | 0.021367 | 0.067569 | 0.447666 | 0.363906 |
| 4 | 1 | 7 | 0.401345 | 0.385705 | 0.009663 | 0.031086 | 0.098303 | 0.462273 | 0.340416 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 197 | 3 | 64 | 0.500000 | 0.500000 | NaN | NaN | NaN | NaN | NaN |
| 198 | 3 | 65 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 199 | 3 | 66 | 0.500000 | 0.500000 | NaN | NaN | NaN | NaN | NaN |
| 200 | 3 | 67 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 201 | 3 | 69 | 0.500000 | 0.500000 | NaN | NaN | NaN | NaN | NaN |
202 rows × 9 columns
import plotly.express as px

# Scatter of mean measure-2 probability per cohort, with asymmetric 95% CI bars.
abs2["Scenario"] = abs2["Scenario"].astype(str)
fig = px.scatter(abs2, x='Cohort', y='Probability_mean', color='Scenario')
fig = fig.update_traces(
    error_y={'type': 'data',
             'symmetric': False,
             'array': abs2['ci95_hi'] - abs2['Probability_mean'],
             'arrayminus': abs2['Probability_mean'] - abs2['ci95_lo']}
)
fig.show()
'''
Continuous Graph
'''
import plotly.graph_objects as go

# Line plot of mean measure-2 probability per cohort with a shaded 95% CI band
# per scenario.
abs2["Scenario"] = abs2["Scenario"].astype(str)
scenarios = abs2["Scenario"].unique()
# Fixed palette at 30% opacity for the CI fill; the mean line reuses the same
# colour at full opacity.
color_map = {
    scenario: f"rgba({r}, {g}, {b}, 0.3)"
    for scenario, (r, g, b) in zip(scenarios, [(99, 109, 250), (239, 85, 59), (0, 204, 150)])
}

fig = go.Figure()
for scenario in scenarios:
    subset = abs2[abs2["Scenario"] == scenario]
    band_fill = color_map[scenario]
    # Mean line.
    fig.add_trace(go.Scatter(x=subset['Cohort'],
                             y=subset['Probability_mean'],
                             name=f'Scenario {scenario}',
                             mode='lines',
                             line=dict(color=band_fill.replace('0.3', '1'))))
    # Invisible upper CI bound; the lower-bound trace fills up to it.
    fig.add_trace(go.Scatter(x=subset['Cohort'],
                             y=subset['ci95_hi'],
                             name=f'Upper Bound of {scenario}',
                             mode='lines',
                             marker=dict(color="#444"),
                             line=dict(width=0),
                             showlegend=False))
    # Lower CI bound with fill to the previous trace -> shaded band.
    fig.add_trace(go.Scatter(x=subset['Cohort'],
                             y=subset['ci95_lo'],
                             name=f'Lower Bound of {scenario}',
                             mode='lines',
                             marker=dict(color="#444"),
                             line=dict(width=0),
                             showlegend=False,
                             fillcolor=band_fill,
                             fill='tonexty'))

fig.update_layout(
    xaxis_title="Cohort",
    yaxis_title="Mean Probability",
    # paper_bgcolor='rgba(0,0,0,0)',
    # plot_bgcolor = 'rgba(0,0,0,0)',
    # showlegend=False
)
fig.show()
fig.write_image(project_path/"images/abs2_means.png")
fig.write_html(project_path/"images/abs2_means.html") # with legend
import plotly.graph_objects as go
# Create dictionary of dataframes
# For every scenario/replication, compute three per-cohort absolute-mobility
# series and plot each in its own figure:
#   Probability  — among children whose parents both have Edu < 5, the share
#                  whose own Edu_level exceeds both parents';
#   Probability2 — among children with at least one parent at level 5, the
#                  share who also reach level 5;
#   Probability3 — elementwise sum of the two.
df_dict = {}
row = 0
for j in range(1, num_scen+1):
    for i in range(1, num_reps+1):
        key = f"scenario {j}"
        df = dfs[row]
        am1 = df[(df['Mother_Edu'] < 5) & (df['Father_Edu'] <5)]
        am2 = df[(df['Mother_Edu'] == 5) | (df['Father_Edu'] ==5)]
        counts = am1[(am1['Edu_level'] > am1['Mother_Edu']) & (am1['Edu_level'] > am1['Father_Edu'])].groupby('Cohort').size()
        counts2 = am2[(am2['Edu_level'] == 5)].groupby('Cohort').size()
        totals = am1.groupby(['Cohort']).size()
        totals2 = am2.groupby('Cohort').size()
        probs = counts/totals
        probs2 = counts2 / totals2
        # NOTE(review): probs2 + probs aligns on the union of the two cohort
        # indices, so probs3 may be longer than probs.index (with NaNs); below
        # it is plotted against probs.index ('Cohort') — confirm the indices
        # always coincide.
        probs3 = probs2 + probs
        if key in df_dict:
            df_dict[key].append({'Cohort': probs.index, 'Probability': probs.values, 'Cohort2': probs2.index, 'Probability2': probs2.values,
                                 'Probability3': probs3.values})
        else:
            df_dict[key] = [{'Cohort': probs.index, 'Probability': probs.values, 'Cohort2': probs2.index, 'Probability2': probs2.values,
                             'Probability3': probs3.values}]
        row+=1
# Create list of replications and scenarios
replications = [f"rep {i}" for i in range(1, num_reps+1)]
scenarios = [f"scenario {j}" for j in range(1, num_scen+1)]
# Create figures (one per measure)
fig = go.Figure()
fig2 = go.Figure()
fig3 = go.Figure()
# Loop through each scenario
for i, scenario in enumerate(scenarios):
    # Get dataframes for current scenario
    dfs_scenario = df_dict[scenario]
    # Get color for current scenario (same hue for all its replications)
    color = f"hsl({(i/len(scenarios))*360}, 50%, 50%)"
    # Loop through each replication of the current scenario
    for j, df in enumerate(dfs_scenario):
        # Get name for current replication
        name = f"{scenario} - rep {j+1}"
        # Add traces to figures
        fig.add_trace(go.Scatter(x=df["Cohort"], y=df["Probability"],
                                 name=name,
                                 mode="lines+markers",
                                 line=dict(color=color),
                                 marker=dict(symbol=int(j))))
        fig2.add_trace(go.Scatter(x=df["Cohort2"], y=df["Probability2"],
                                  name=name,
                                  mode="lines+markers",
                                  line=dict(color=color),
                                  marker=dict(symbol=int(j))))
        fig3.add_trace(go.Scatter(x=df["Cohort"], y=df["Probability3"],
                                  name=name,
                                  mode="lines+markers",
                                  line=dict(color=color),
                                  marker=dict(symbol=int(j))))
# Show figures
fig.show()
fig2.show()
fig3.show()
# Inspect the flattened columns available for the summary tables below.
dfs[0].columns
Index(['AgentID', 'Step', 'Age', 'Edu_level', 'Gender', 'Income', 'Generation',
'Cohort', 'Weight', 'Capital', 'Brut', 'Child1', 'Child2', 'Mother',
'Father', 'Partner', 'Education', 'Cultural', 'Economic', 'Social',
'Child1_Edu', 'Child2_Edu', 'Child1_Income', 'Child2_Income',
'Mother_Edu', 'Father_Edu', 'Mother_Income', 'Father_Income',
'Mother_Cultural', 'Mother_Economic', 'Mother_Social',
'Father_Cultural', 'Father_Economic', 'Father_Social', 'Scenario',
'Replication'],
dtype='object')
'''
Make a function to create summary tables for each data set
'''
def summary_tab(df):
    """Summarize child education per cohort for every (mother, father) education pair.

    For each cohort in ``df`` and each combination of mother/father education
    levels 1..5, report how many children fall in that parental-education cell
    and the mean and median of their own ``Edu_level``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Cohort', 'Mother_Edu', 'Father_Edu' and
        'Edu_level'.

    Returns
    -------
    pandas.DataFrame
        Columns: Cohort, Average Edu, Median Edu, Total, Mother Edu, Father Edu.
        Cells with no children get Total 0 and NaN mean/median.
    """
    records = []
    for cohort in df.Cohort.unique():
        cht = df[df['Cohort'] == cohort]
        for mom_edu in range(1, 6):
            for dad_edu in range(1, 6):
                # BUG FIX: select children whose mother AND father have the
                # given levels (was '|', which mixed unrelated parent pairs
                # into each (mother, father) cell).
                cell = cht[(cht['Mother_Edu'] == mom_edu) & (cht['Father_Edu'] == dad_edu)]
                records.append({
                    'Cohort': cohort,
                    'Average Edu': cell['Edu_level'].mean(),
                    'Median Edu': cell['Edu_level'].median(),
                    # BUG FIX: len() counts children; '.size' was rows*columns.
                    'Total': len(cell),
                    'Mother Edu': mom_edu,
                    'Father Edu': dad_edu,
                })
    # Build the frame once from records (much faster than per-cell .loc writes).
    return pd.DataFrame(records, columns=['Cohort', 'Average Edu', 'Median Edu',
                                          'Total', 'Mother Edu', 'Father Edu'])
# Build the parental-education summary table for every scenario/replication,
# excluding the initial generation (Generation 1 has no parent information).
ls_full = pd.DataFrame()
pos = 0
for scen_num in range(1, num_scen + 1):
    for rep_num in range(1, num_reps + 1):
        with_parents = dfs[pos][dfs[pos]['Generation'] != 1]
        tab = summary_tab(with_parents)
        tab['Scenario'] = scen_num
        tab['Replication'] = rep_num
        ls_full = pd.concat([ls_full, tab], ignore_index=True)
        pos += 1
# pd.set_option('display.max_rows', None)
ls_full
| Cohort | Average Edu | Median Edu | Total | Mother Edu | Father Edu | Scenario | Replication | |
|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 3.888889 | 4.5 | 648.0 | 1.0 | 1.0 | 1 | 1 |
| 1 | 7.0 | 3.954545 | 4.0 | 792.0 | 1.0 | 2.0 | 1 | 1 |
| 2 | 7.0 | 3.950000 | 4.0 | 1440.0 | 1.0 | 3.0 | 1 | 1 |
| 3 | 7.0 | 4.137255 | 4.0 | 1836.0 | 1.0 | 4.0 | 1 | 1 |
| 4 | 7.0 | 4.343750 | 5.0 | 2304.0 | 1.0 | 5.0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 42270 | 65.0 | 5.000000 | 5.0 | 36.0 | 5.0 | 1.0 | 3 | 10 |
| 42271 | 65.0 | 5.000000 | 5.0 | 36.0 | 5.0 | 2.0 | 3 | 10 |
| 42272 | 65.0 | 5.000000 | 5.0 | 36.0 | 5.0 | 3.0 | 3 | 10 |
| 42273 | 65.0 | 5.000000 | 5.0 | 36.0 | 5.0 | 4.0 | 3 | 10 |
| 42274 | 65.0 | 5.000000 | 5.0 | 36.0 | 5.0 | 5.0 | 3 | 10 |
42275 rows × 8 columns
# pd.set_option("display.max_rows", 25)
# Aggregate the per-replication summary tables across replications: mean/SEM/SD
# of the average and median child education per
# (Scenario, Cohort, Mother Edu, Father Edu), plus 95% CIs for both.
agg_ls = ls_full.groupby(['Scenario', 'Cohort', 'Mother Edu', 'Father Edu']).agg({'Average Edu': ["mean", "sem", "std"], 'Median Edu': ['mean', 'sem', 'std'] }).reset_index()
agg_ls['ci95_hi_avg'] = agg_ls['Average Edu']['mean'] + 1.96* agg_ls['Average Edu']['sem']
agg_ls['ci95_lo_avg'] = agg_ls['Average Edu']['mean'] - 1.96* agg_ls['Average Edu']['sem']
agg_ls['ci95_hi_med'] = agg_ls['Median Edu']['mean'] + 1.96* agg_ls['Median Edu']['sem']
agg_ls['ci95_lo_med'] = agg_ls['Median Edu']['mean'] - 1.96* agg_ls['Median Edu']['sem']
# Flatten the MultiIndex columns ('Average Edu','mean') -> 'Average Edu_mean';
# plain columns pick up a trailing '_' which the rename below strips.
agg_ls.columns = ['_'.join(col) for col in agg_ls.columns]
agg_ls.rename(columns = {'Scenario_':'Scenario', 'Cohort_':'Cohort', 'Mother Edu_':'Mother Edu', 'Father Edu_':'Father Edu',
                         'ci95_hi_avg_': 'ci95_hi_avg', 'ci95_lo_avg_':'ci95_lo_avg',
                         'ci95_hi_med_': 'ci95_hi_med', 'ci95_lo_med_':'ci95_lo_med'}, inplace = True)
# Combine the parental education levels into a single label for plotting.
agg_ls['Parent Edu'] = agg_ls.apply(lambda x: [x['Mother Edu'], x['Father Edu']], axis=1)
agg_ls.head(5)
| Scenario | Cohort | Mother Edu | Father Edu | Average Edu_mean | Average Edu_sem | Average Edu_std | Median Edu_mean | Median Edu_sem | Median Edu_std | ci95_hi_avg | ci95_lo_avg | ci95_hi_med | ci95_lo_med | Parent Edu | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3.0 | 1.0 | 1.0 | 1.805588 | 0.125877 | 0.398057 | 1.2 | 0.133333 | 0.421637 | 2.052307 | 1.558870 | 1.461333 | 0.938667 | [1.0, 1.0] |
| 1 | 1 | 3.0 | 1.0 | 2.0 | 2.405332 | 0.089751 | 0.283817 | 2.0 | 0.149071 | 0.471405 | 2.581244 | 2.229421 | 2.292180 | 1.707820 | [1.0, 2.0] |
| 2 | 1 | 3.0 | 1.0 | 3.0 | 3.160301 | 0.073470 | 0.232331 | 3.3 | 0.152753 | 0.483046 | 3.304301 | 3.016300 | 3.599395 | 3.000605 | [1.0, 3.0] |
| 3 | 1 | 3.0 | 1.0 | 4.0 | 3.436475 | 0.068618 | 0.216990 | 4.0 | 0.000000 | 0.000000 | 3.570967 | 3.301983 | 4.000000 | 4.000000 | [1.0, 4.0] |
| 4 | 1 | 3.0 | 1.0 | 5.0 | 3.553121 | 0.076782 | 0.242806 | 4.1 | 0.100000 | 0.316228 | 3.703614 | 3.402628 | 4.296000 | 3.904000 | [1.0, 5.0] |
import plotly.express as px

# Median child education (replication mean) by cohort, one line per
# parental-education pair — scenario 1.
agg_ls["Parent Edu"] = agg_ls["Parent Edu"].astype(str)
data = agg_ls[agg_ls['Scenario'] == 1]
fig = px.line(data, x='Cohort', y='Median Edu_mean', color='Parent Edu')
fig.show()
import plotly.express as px

# Median child education (replication mean) by cohort, one line per
# parental-education pair — scenario 2.
agg_ls["Parent Edu"] = agg_ls["Parent Edu"].astype(str)
data = agg_ls[agg_ls['Scenario'] == 2]
fig = px.line(data, x='Cohort', y='Median Edu_mean', color='Parent Edu')
fig.show()
import plotly.express as px

# Median child education (replication mean) by cohort, one line per
# parental-education pair — scenario 3.
agg_ls["Parent Edu"] = agg_ls["Parent Edu"].astype(str)
data = agg_ls[agg_ls['Scenario'] == 3]
fig = px.line(data, x='Cohort', y='Median Edu_mean', color='Parent Edu')
fig.show()
More mixing in scenario 3 compared to 1 and 2.